# stm-analysis.R
# Performs analysis on chosen model
require(stm)
rm(list=ls(all=TRUE))
load("Models-2015-12-22-k10.Rdata")
plotModels(chinSelect)
# Which run dominates numerically, based on plotModels? Select it 
chinPrevFit <- chinSelect$runout[[3]]

# Analysis on chosen model run . # Table: most frequent and exclusive terms (FREX): 
labelTopics(chinPrevFit, topics=NULL, n=10)[2]

# What's the size of each topic? 
colMeans(chinPrevFit$theta)
# This can also be shown graphically:
plot.STM(chinPrevFit,type="summary") #, xlim=c(0,.4))

# Most representative statements by topic
# Chinese 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=1)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=2)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=3)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=4)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=5)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=6)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=7)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=8)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=9)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$openend, n=10, topics=10)$docs[[1]] 

# English translation (manual, by students)
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=1)$docs[[1]]
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=2)$docs[[1]]
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=3)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=4)$docs[[1]]
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=5)$docs[[1]]
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=6)$docs[[1]]
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=7)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=8)$docs[[1]]
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=9)$docs[[1]] 
findThoughts(chinPrevFit, texts=meta$OpenEnd, n=10, topics=10)$docs[[1]]

# Identify and label the chosen topics
chosentopics4order <- c(10,7,3,1)
labelTopics(chinPrevFit, topics=chosentopics4order, n=10)
chosentopics4labelsorder <- c("Smog", 
                              "Glacier melt and sea-level rise     ", 
                              "Vehicle and industrial emissions", 
                              "Impact on humans")
colMeans(chinPrevFit$theta)[chosentopics4order]

# Control for Xi'an location due to balance statistics
meta$xian <- 0
meta$xian[meta$Site==1] <- 1
table(meta$Site, meta$xian)

# Table 1: FREX details
# Declining term frequencies 
for (i in chosentopics4order) 
{
  print(exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][i,])), decreasing=TRUE))
}

plot(c(1:10), exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][3,]),10), decreasing=TRUE))
lines(c(1:10), exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][3,]),10), decreasing=TRUE))
checkBeta(chinPrevFit, tolerance=.4)
chinPrevFit$vocab[258] ## smog
##
# Recall: chosentopics4order <- c(10,7,3,1)
plot(c(1:10), exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][10,]),10), decreasing=TRUE), type="line")
lines(c(1:10), exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][7,]),10), decreasing=TRUE))
lines(c(1:10), exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][3,]),10), decreasing=TRUE))
lines(c(1:10), exp(1)^sort(tail(sort(chinPrevFit$beta$logbeta[[1]][1,]),10), decreasing=TRUE))

# Figure 1: Topic prevalence over co-variates
prep <- estimateEffect(chosentopics4order ~ climchg+xian+Race, 
                       chinPrevFit,
                       meta=meta) # optional: uncertainty="Global" / "None" 

# Showing the experimental effect
plot.estimateEffect(prep,  #   main="Experiment, K=8, run #2",
                    covariate = "climchg", 
                    topics = chosentopics4order,
                    model=chinPrevFit, 
                    labeltype="custom",
                    custom.labels=chosentopics4labelsorder,
                    method="difference",
                    # verbose.labels=TRUE,
                    xlim=c(-.1,.1),
                    cov.value1=1, cov.value2=0,
                    xlab="       Air pollution        ...        Global warming")



# Linking theta parameters for analysis by air/climate separately
###
# Part 3: Closer inspection of textual answer
# Look at topical prevalence over humanmade variable. Use XLS for that. 
# Add topical prevalence (theta parameters) to "meta" data frame
meta$theta1 <- chinPrevFit$theta[,1]
meta$theta3 <- chinPrevFit$theta[,3]
meta$theta7 <- chinPrevFit$theta[,7]
meta$theta10 <- chinPrevFit$theta[,10]
# meta$theta8 <- chinPrevFit$theta[,8]
# and save it for XLS
write.table(meta, "dataset-thetas.txt", sep="\t") 
# Note: The first row of the XLS output will need to be moved one cell to the right. 

# What's the distribution of the theta parameters? Looks like beta. 
hist(meta$theta10, breaks=50, xlim=c(0,1))
hist(meta$theta7, breaks=50, xlim=c(0,1))
hist(meta$theta3, breaks=50, xlim=c(0,1))
hist(meta$theta1, breaks=50, xlim=c(0,1))

# Subset the data frames into metaclim and metaair
metaair <- meta[meta$airpol==1,]
length(metaair[[1]])
metaclim <- meta[meta$climchg==1,]
length(metaclim[[7]])

# Beta regressions. Air pollution. 
# install.packages("betareg")
library(betareg)
# Air pollution: Smog and causes in text
mod.air.smog    <-betareg(theta10 ~         Gender+    Age +Educ+xian+Race, data=metaair)
mod.air.causes  <-betareg(theta3  ~         Gender+    Age +Educ+xian+Race, data=metaair)

# Climate: Health, causes, SLR. 
mod.clim.glacier<-betareg(theta7  ~         Gender+    Age +Educ+xian+Race, data=metaclim)
mod.clim.causes <-betareg(theta3  ~         Gender+    Age +Educ+xian+Race, data=metaclim)
mod.clim.health <-betareg(theta1  ~         Gender+    Age +Educ+xian+Race, data=metaclim)

# summary(mod.air.smog)

table2 <- htmlreg(list(
  mod.air.smog,
  mod.air.causes ,
  mod.clim.glacier,
  mod.clim.causes,
  mod.clim.health
), file="table2.html")


###
# Overall regression: Is there an interaction effect between the experimental effect and age?
# on smog:
meta$ageXairpol = meta$Age * meta$airpol
interaction.age.10 <- betareg(theta10 ~ ageXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
# on cars/industry:
interaction.age.7 <- betareg(theta7 ~ ageXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
# on glaciers/slr:
interaction.age.3 <- betareg(theta3 ~ ageXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
# on health
interaction.age.1 <- betareg(theta1 ~ ageXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)

# details 
summary(interaction.age.10)
summary(interaction.age.10)
summary(interaction.age.10)
summary(interaction.age.10)

# What about education?
# on smog: 
meta$educXairpol = meta$Educ * meta$airpol
interaction.educ.10 <- betareg(theta10 ~ educXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
# on cars/industry: 
interaction.educ.7 <- betareg(theta7 ~ educXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
# on glaciers/slr:
interaction.educ.3 <- betareg(theta3 ~ educXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
# on health
interaction.educ.1 <- betareg(theta1 ~ educXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)

# What about gender?
meta$genderXairpol = (meta$Gender-1) * meta$airpol
interaction.gender.10 <- betareg(theta10 ~ genderXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
interaction.gender.7  <- betareg(theta7 ~ genderXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
interaction.gender.3  <- betareg(theta3 ~ genderXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)
interaction.gender.1  <- betareg(theta1 ~ genderXairpol+airpol +Gender+Age+Educ+xian+Race, data=meta)

table.S4 <- htmlreg(list(
  interaction.age.10,interaction.age.7, interaction.age.3, interaction.age.1,
  interaction.educ.10, interaction.educ.7, interaction.educ.3, interaction.educ.1,
  interaction.gender.10, interaction.gender.7, interaction.gender.3, interaction.gender.1), 
  digits=3,
  file="table_S4.html")

# End
###########################
